home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
- Brewster@think.com
- */
-
-
- /* implements the search part of irext.h
- (search_word and finished_search_word)
- -brewster
-
- Split from irsearch.c
-
- 5/31/91 Added scale_scores. Fixed document_score_array to long.
- 7/8/91 Removed scale_scores, handled in search_word with doc_id > 0.
- 2/4/92 Made document_score_array a double.
-
- - Jonny G
- * $Log: sersrch.c,v $
- * Revision 1.24 92/04/28 16:56:54 morris
- * added boolean to serial engine
- *
- * Revision 1.23 92/03/15 10:15:18 jonathan
- * Added Simon Spero's ASSIGN replacement for read_bytes.
- *
- * Revision 1.22 92/03/05 07:09:54 shen
- * add two more dummy arguments to call to init_search_engine
- *
- * Revision 1.21 92/02/12 17:29:52 jonathan
- * Conditionalized inclusion of object code.
- *
- * Revision 1.20 92/02/12 13:40:06 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- #include "cdialect.h"
- #include "irfiles.h"
- #include "irsearch.h"
- #include "irext.h"
- #include "byte_order.h"
- #include <string.h>
-
- #ifdef BOOL
- #include "obj.h"
- #include "irparse.h"
- object* currentQuery = NULL; /* kludge until irext goes away */
- #endif /* def BOOL */
-
- /* weighting for relevant document terms -
- this may become a parameter to the query.
- */
-
- #define RF_WEIGHTING 0.1
-
- /* ==================================
- * === Initialization Functions ===
- * ==================================*/
-
-
- long init_search_engine(file, initialize, for_search, cm_mem_percent,
- text_size, grow_percent)
- char* file;
- boolean initialize;
- boolean for_search;
- long cm_mem_percent; /* unused */
- long text_size; /* unused */
- long grow_percent; /* unused */
- {
- static boolean inited = false;
-
- if (inited == false)
- {
- #ifdef BOOL
- initObj();
- initBool();
- #endif
- inited = true;
- }
-
- return(0);
- }
-
- long finished_search_engine()
- {
- return(0);
- }
-
-
- /*
- * ext_open_database: see irext.h
- */
-
- long ext_open_database (db, initialize, for_search)
- database *db;
- boolean initialize;
- boolean for_search;
- { /* this has to deal with the .inv file */
- char file[MAX_FILE_NAME_LEN];
-
- if(initialize) /* make a new one */
- db->index_stream = s_fopen(index_filename(file, db), "w+b");
- else if(for_search) /* just search */
- db->index_stream = s_fopen(index_filename(file, db), "rb");
- else /* write to an existing db */
- db->index_stream = s_fopen(index_filename(file, db), "r+b");
-
- if (db->index_stream == NULL) {
- waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n",
- file);
- disposeDatabase(db);
- return(1);
- }
- return(0);
- }
-
-
-
- /*
- * ext_close_database: see irext.h
- */
-
- long ext_close_database (db)
- database *db;
- {
- return(0);
- }
-
- char *database_file(database_name)
- char *database_name;
- {
- return(database_name);
- }
-
- /*===========================*
- *=== Setting Paramters ===*
- *===========================*/
-
- long max_hit_retrieved = 0;
- char **srcs = NULL;
-
- long set_query_parameter (mask, parameters)
- long mask;
- query_parameter_type * parameters;
- {
- switch (mask)
- {
- case SET_MAX_RETRIEVED_MASK:
- max_hit_retrieved = parameters->max_hit_retrieved;
- return(0);
- break;
- case SET_SELECT_SOURCE:
- if(NULL != srcs){
- if(NULL != srcs[0])
- s_free(srcs[0]);
- s_free(srcs);
- }
- srcs = parameters->srcs;
- break;
- default:
- return(-1);
- break;
- }
- return(0);
- }
-
- /*==============================*
- *=== Document Score Array ===*
- *==============================*/
-
- double *document_score_array = NULL;
- long document_score_array_len = 0;
-
- /* make_document_score_array insures that the document_score_array
- array is long enough, if not it makes it long enough */
- static void make_document_score_array _AP((long length));
- static void make_document_score_array(length)
- long length;
- {
- if(length <= document_score_array_len)
- return;
- /* we have to make a new one. free the old one first (if any) */
- if(document_score_array != 0){
- s_free(document_score_array);
- }
- document_score_array = (double*)s_malloc(
- (size_t)(length * sizeof(double)));
- document_score_array_len = length;
- }
-
- static void destroy_document_score_array _AP((void));
- static void destroy_document_score_array()
- {
- s_free(document_score_array);
- document_score_array_len = 0;
- }
-
- void clear_document_score_array()
- /* side effects the document_score_array. */
- {
- memset(document_score_array, 0,
- document_score_array_len * sizeof(double));
- }
-
- /* for debugging purposes */
- void print_document_score_array(start,stop)
- unsigned long start;
- unsigned long stop;
- /* assumes start >= 0, stop < db->doc_table_allocated_entries */
- {
- long i;
- for(i = start; i <= stop; i++){
- printf("entry number %d: %f \n",
- i, document_score_array[i]);
- }
- }
-
-
-
- /*=========================*
- *=== Best Hits Array ===*
- *=========================*/
-
- hit *best_hits_array = NULL;
- long best_hits_array_len = 0;
- long current_best_hit = 0;
-
- /* see irext.h for doc */
- long init_best_hit (db)
- database *db;
- {
-
- #ifdef BOOL
- if (currentQuery != NULL)
- send(currentQuery,InitBestHit,db);
- #endif /* def BOOL */
-
- return(0);
- }
-
- /* make_best_hits_array insures that the best_hits_array
- array is long enough, if not it makes it long enough */
- static void make_best_hits_array _AP((long length));
- static void make_best_hits_array(length)
- long length;
- {
- if(length <= best_hits_array_len)
- return;
- /* we have to make a new one. free the old one first (if any) */
- if(best_hits_array != 0){
- s_free(best_hits_array);
- }
- best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit)));
- best_hits_array_len = length;
- }
-
- static void destroy_best_hits_array _AP((void));
- static void destroy_best_hits_array()
- {
- s_free(best_hits_array);
- best_hits_array_len = 0;
- }
-
- void clear_best_hits_array()
- /* side effects the best_hits_array. XXX could use memset */
- {
- memset((char*)best_hits_array, 0, best_hits_array_len * sizeof(hit));
- }
-
- /* for debugging purposes */
- void print_best_hits()
- {
- long i;
- for( i = 0; i < best_hits_array_len; i++){
- if (best_hits_array[i].weight != 0)
- { printf("Best hit %ld: weight %ld, doc_id %ld, headline %s, filename %s, lines %ld\n",
- i, best_hits_array[i].weight,
- best_hits_array[i].document_id,
- best_hits_array[i].headline,
- best_hits_array[i].filename,
- best_hits_array[i].number_of_lines);
- }
- }
- }
-
- void sort_best_hits(db)
- database * db;
- {
- /* returns nothing.
- * side effects best_hits and document_score_array
- */
-
- long i, doc;
- double worst_weight_to_make_it = 0.0;
- document_table_entry doc_entry;
- long best_hit_number = 0;
-
- /* snuff the scores */
- for(i = 0; i < max_hit_retrieved; i++){
- best_hits_array[i].weight = 0;
- }
-
- /* loop over the doc, and keep the doc_id and weight in best hit table */
- for(doc = 0; doc < db->doc_table_allocated_entries; doc++){
- double weight = document_score_array[doc];
- if(worst_weight_to_make_it < weight){
- /* merge it into the best_hits array. start at the bottom */
- for(i = (max_hit_retrieved - 1); i >= 0; i--){
- if(weight > best_hits_array[i].weight
- /* && (check_document_id(doc, db) == true) too slow.*/
- ){
- /* move this entry down */
- if((i + 1) < max_hit_retrieved){
- best_hits_array[i+1].weight = best_hits_array[i].weight;
- best_hits_array[i+1].document_id = best_hits_array[i].document_id;
- }
- best_hits_array[i].document_id = doc;
- best_hits_array[i].weight = weight;
- }
- else
- break;
- }
- }
- }
-
- for(i = 0; i < max_hit_retrieved; i++){
- if(best_hits_array[i].weight <= 0) /* if we are out of good stuff, return */
- return;
- /* fill in the rest of the hit */
- if (read_document_table_entry(&doc_entry,
- best_hits_array[i].document_id,
- db)
- == true){
- best_hits_array[best_hit_number].weight = best_hits_array[i].weight;
- best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id;
- best_hits_array[best_hit_number].start_character = doc_entry.start_character;
- best_hits_array[best_hit_number].end_character = doc_entry.end_character;
- best_hits_array[best_hit_number].document_length = doc_entry.document_length;
- best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines;
- read_filename_table_entry(doc_entry.filename_id,
- best_hits_array[best_hit_number].filename,
- best_hits_array[best_hit_number].type,
- NULL,
- db),
- strncpy(best_hits_array[best_hit_number].headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_FILE_NAME_LEN);
- best_hit_number++;
- }
- beFriendly();
- }
- for(i = best_hit_number; i < max_hit_retrieved; i++){
- best_hits_array[best_hit_number].weight = 0;
- }
- /* print_best_hits(s); for debugging */
- }
-
-
- /* returns the next best hit */
- long best_hit(db, doc_id, best_character, best_line, score)
- database *db;
- long *doc_id;
- long *best_character;
- long *best_line;
- long *score;
- {
-
- *best_character = 0;
- *best_line = 0;
-
- #ifdef BOOL
- if (currentQuery != NULL) /* for boolean */
- {
- send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score);
- if (*doc_id > 0)
- return(0); /* ok */
- else
- return(-1); /* no more docs */
- }
- #endif /* BOOL */
-
- if(current_best_hit > best_hits_array_len)
- return(1);
- if(best_hits_array[current_best_hit].weight == 0)
- return(1);
- *doc_id = best_hits_array[current_best_hit].document_id;
- *score = best_hits_array[current_best_hit].weight;
- current_best_hit++;
- return(0);
- }
-
- long finished_best_hit(db)
- database *db;
- {
-
- #ifdef BOOL
- if (currentQuery != NULL) /* for boolean */
- { send(currentQuery,Delete);
- currentQuery = NULL;
- return(0);
- }
- #endif /* BOOL */
-
- /* if we are on a small machine, we might want to
- destroy_document_score_array */
- clear_document_score_array();
- clear_best_hits_array();
- current_best_hit = 0;
- return(0);
- }
-
- /*=============================*
- *=== Searching for words ===*
- *=============================*/
-
- /* see irext.h for doc */
- long init_search_word (db)
- database* db;
- {
- return(0);
- }
-
- /* see irext.h for doc */
- long search_word(word,char_pos, line_pos, weight, doc_id,
- word_pair, db)
- char *word; /* the word to be searched for */
- long char_pos; /* the position of the start of the word */
- long line_pos; /* is this needed? not for signature system */
- long weight; /* how important the word looks syntactically,
- such as is it bold */
- long doc_id; /* current document, seed words is 0,
- then it increments into the relevant
- document */
- long word_pair;
- database *db;
- {
- /* this side effects the document_score_array,
- * and downcases the word.
- * Returns 0 if successful or word not present,
- * returns non-0 if an error.
- *
- */
-
- long not_full_flag = INDEX_BLOCK_FULL_FLAG; /*start out full so it will go on looking */
- long count, index_block_size;
- long internal_document_id, internal_weight, number_of_valid_entries;
- long index_file_block_number;
- long number_of_occurances;
-
- FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)];
- char *index_buffer;
- char *i;
- FILE *stream = NULL;
-
- index_buffer = (char*)index_buffer_data;
-
- index_file_block_number =
- look_up_word_in_dictionary(word, &number_of_occurances, db);
-
- current_best_hit = 0; /* so that the best hits willstart from 0 */
-
- /* check the document_score_array */
- if(document_score_array_len < db->doc_table_allocated_entries)
- make_document_score_array(db->doc_table_allocated_entries);
-
- if(index_file_block_number >= 0){
- stream = db->index_stream;
-
- while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) &&
- (index_file_block_number != 0)){
- /* read the index block */
- if (0 != fseek(stream, (long)index_file_block_number,
- SEEK_SET))
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "fseek failed into the inverted file to position %ld",
- (long)index_file_block_number);
- return(-1);
- }
-
- read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE);
-
- ASSIGN(not_full_flag,
- INDEX_BLOCK_FLAG_SIZE,
- index_buffer,
- INDEX_BLOCK_HEADER_SIZE,
- 0 );
- ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE,
- index_buffer+INDEX_BLOCK_FLAG_SIZE,
- INDEX_BLOCK_HEADER_SIZE,
- INDEX_BLOCK_FLAG_SIZE);
- ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE,
- index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE,
- INDEX_BLOCK_HEADER_SIZE,
- INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE);
-
- /*
- this is equivalent, but slower:
- not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
- index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
- index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
-
- printf("flag = %d, block_num = %d, block_size = %d\n",
- not_full_flag,
- index_file_block_number,
- index_block_size);
-
- fflush(stdout);
- */
- if(EOF == index_block_size)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "reading from the index file failed");
- return(-1);
- }
-
- if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){
- /* not full */
- number_of_valid_entries = index_file_block_number;
- }
- else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){
- /* full */
- number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
- }
- else{ /* bad news, file is corrupted. */
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Expected the flag in the inverted file to be valid. it is %ld",
- not_full_flag);
- return(-1);
- }
- /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */
-
- /* add the array to the document_score_array */
- number_of_valid_entries /= INDEX_ELEMENT_SIZE;
-
- for(count=0;count < number_of_valid_entries;count++) {
- int wgt;
- int did;
- if(count%1024 == 0) {
- read(fileno(stream),index_buffer,INDEX_ELEMENT_SIZE*
- MIN(1024,number_of_valid_entries-count));
- i=index_buffer;
- }
- ASSIGN(wgt,WEIGHT_SIZE,
- i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,
- INDEX_ELEMENT_SIZE,
- DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE);
- ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0);
-
- internal_weight = wgt;
- internal_document_id = did;
-
- /*
- printf("entry %ld, Doc_id: %ld, weight %ld \n",
- count, internal_document_id, internal_weight);
- fflush(stdout);
- */
- if(EOF == internal_weight)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "reading from the doc-id table failed");
- return(-1);
- }
- /* if(doc_id > 0) we are doing a relevant document */
- document_score_array[internal_document_id] +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- i+=INDEX_ELEMENT_SIZE;
- }
- }
- return(0);
- }
- else if(0 == index_file_block_number){
- /* an error occurred on looking up the word */
- return(-1);
- }
- else /* index_file_block_number is negative */
- return(0); /* word not present */
- }
-
- /* now collect the best hits */
- long finished_search_word(db)
- database *db;
- {
-
- #ifdef BOOL
- if (currentQuery != NULL)
- return; /* do nothing for boolean */
- #endif /* def BOOL */
-
- /* check the document_score_array */
- if(document_score_array_len < db->doc_table_allocated_entries)
- make_document_score_array(db->doc_table_allocated_entries);
-
- make_best_hits_array(max_hit_retrieved);
- sort_best_hits(db);
- return(0);
- }
-
-